library(lubridate)
library(tidyverse)
library(tidytext)
library(govtrackR)
library(scales)
library(ggtext)
library(gganimate)
library(hrbrthemes)
library(rtemis)
library(viridis)
library(ggrepel)
library(highcharter)
library(tidylo)
library(widyr)
library(gt)
library(tidygraph)
library(ggraph)
library(igraph)
library(d3r)
library(treemap)
library(sunburstR)
library(reactable)
library(skimr)
library(trelliscopejs)
library(glue)
options(highcharter.theme = hc_theme_hcrt(tooltip = list(valueDecimals = 2)))
Lets explore PSC’s to help us better understand the products and services the government procures.
tbl_psc <- dictionary_psc_active(only_active = T, snake_names = T)
Lets build an an interactive table that lets us see all the PSCs
psc_tbl <- tbl_psc %>%
select(
is_active_psc,
type_psc,
name_solicitation_group,
code_product_service,
name_product_service,
date_start,
date_end,
details_product_service_includes
) %>%
reactable(
filterable = T,
resizable = T,
searchable = T,
showPageSizeOptions = T,
defaultPageSize = 4,
pageSizeOptions = c(5, 10, 20),
sortable = T,
compact = T
)
Here it is
psc_tbl
What is the breakdown between these two groups?
gg_psc_bkd <-
tbl_psc %>%
count(type_psc, sort = T, name = "count") %>%
mutate(type_psc = fct_reorder(type_psc, count)) %>%
ggplot(aes(x = type_psc, y = count, fill = type_psc)) +
geom_bar(stat = "identity", position = "dodge") +
scale_fill_viridis(discrete = TRUE, name = "") +
theme_ipsum() +
ylab("Number of Product Service Codes") +
xlab("") +
ggtitle("Products Versus Services") +
coord_flip()
gg_psc_bkd
How often are new PSC’s added?
First, how many new PSC’s are there on a given date?
tbl_new_codes <-
tbl_psc %>%
group_by(type_psc, date_start) %>%
summarise(count_added = n(), .groups = "drop")
Lets explore this interactively
hc_new_psc <-
hchart(tbl_new_codes,
"line",
hcaes(x = date_start, y = count_added, group = type_psc)) %>%
hc_title(text = "New Product Service Codes by Date Added") %>%
hc_yAxis(title = list(text = "Count of New Codes")) %>%
hc_xAxis(title = list(text = "Date Added"))
hc_new_psc
Lets make this look a bit better
hc_new_psc <-
hc_new_psc %>%
hc_add_theme(hc_theme_hcrt())
hc_new_psc
Lets take a look at the 10 newest PSCs
tbl_10_new_psc <-
tbl_psc %>%
filter(!is_parent_psc) %>%
arrange(desc(date_start)) %>%
group_by(type_psc) %>%
slice(1:10) %>%
ungroup()
gt(tbl_10_new_psc)
| is_parent_psc | is_active_psc | type_psc | id_solicitation_group | name_solicitation_group | group_product_service | code_product_service | name_product_service | name_psc | details_psc | date_start | date_end | name_product_service_full | details_product_service_excludes | details_product_service_notes | details_product_service_includes | count_items_included | count_items_excluded | count_days_active |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| FALSE | TRUE | PRODUCT | 15 | AEROSPACE CRAFT AND STRUCTURAL COMPONENTS | 155 | 1555 | SPACE VEHICLES | SPACE VEHICLES | NA | 2015-10-01 | NA | NA | NA | THIS CLASS INCLUDES ONLY COMPLETE SPACE VEHICLES IN ASSEMBLED OR UNASSEMBLED FORM | NA | 0 | 0 | 1862 |
| FALSE | TRUE | PRODUCT | 16 | AEROSPACE CRAFT COMPONENTS AND ACCESSORIES | 167 | 1675 | SPACE VEHICLE COMPONENTS | SPACE VEHICLE COMPONENTS | NA | 2015-10-01 | NA | SPACE VEHICLE COMPONENTS | REMOTE GUIDANCE EQUIPMENT | NA | STRUCTURAL COMPONENTS; COMPONENTS AND ACCESSORIES SPECIALLY DESIGNED FOR INSTALLATION IN OR ON SPACE VEHICLES; INTERNAL CONTROL SYSTEMS | 2 | 0 | 1862 |
| FALSE | TRUE | PRODUCT | 16 | AEROSPACE CRAFT COMPONENTS AND ACCESSORIES | 167 | 1677 | SPACE VEHICLE REMOTE CONTROL SYSTEMS | SPACE VEHICLE REMOTE CONTROL SYSTEMS | NA | 2015-10-01 | NA | SPACE VEHICLE REMOTE CONTROL SYSTEMS | INTERNAL CONTROL SYSTEMS; COMPONENTS DESIGNED FOR USE WITH BOTH GUIDED MISSILE AND SPACE VEHICLE SYSTEMS | NA | SPECIFICALLY DESIGNED COMPONENTS OF SPACE VEHICLE REMOTE CONTROL SYSTEMS | 0 | 1 | 1862 |
| FALSE | TRUE | PRODUCT | 17 | AEROSPACE CRAFT LAUNCHING, LANDING, GROUND HANDLING AND SERVICING EQUIPMENT | 172 | 1725 | SPACE VEHICLE LAUNCHERS | SPACE VEHICLE LAUNCHERS | NA | 2015-10-01 | NA | SPACE VEHICLE LAUNCHERS | LAUNCHERS USED WITH BOTH GUIDED MISSILE AND SPACE VEHICLES | NA | LAUNCHERS SPECIFICALLY DESIGNED FOR SPACE VEHICLES | 0 | 0 | 1862 |
| FALSE | TRUE | PRODUCT | 17 | AEROSPACE CRAFT LAUNCHING, LANDING, GROUND HANDLING AND SERVICING EQUIPMENT | 173 | 1735 | SPACE VEHICLE HANDLING AND SERVICING EQUIPMENT | SPACE VEHICLE HANDLING AND SERVICING EQUIPMENT | NA | 2015-10-01 | NA | SPACE VEHICLE HANDLING AND SERVICING EQUIPMENT | EQUIPMENT USED IN HANDLING OR SERVICING BOTH GUIDED MISSILES AND SPACE VEHICLES | THIS CLASS DOES NOT INCLUDE SPACE VEHICLE AERIAL RECOVERY SYSTEMS WHICH ARE CLASSIFIED IN CLASS 1670 | SPECIALLY DESIGNED TRUCKS AND TRAILERS FOR SUE IN TRANSPORTING SPACE VEHICLES; SPECIALLY DESIGNED SLINGS, HOISTS, JACKS, BLOWERS, SELF-PROPELLED VEHICLES, SPECIALLY DESIGNED FOR SPACE VEHICLE HANDLING OR SERVICING; COVERS, SPACE VEHICLE | 2 | 0 | 1862 |
| FALSE | TRUE | PRODUCT | 70 | INFORMATION TECHNOLOGY EQUIPMENT SOFTWARE,SUPPLIES& SUPPORT EQUIPMENT | 701 | 7010 | INFORMATION TECHNOLOGY EQUIPMENT SYSTEM CONFIGURATION | INFORMATION TECHNOLOGY EQUIPMENT SYSTEM CONFIGURATION | NA | 2015-10-01 | NA | INFORMATION TECHNOLOGY EQUIPMENT SYSTEM CONFIGURATION | NA | A GROUP OF GENERAL PURPOSE ANALOG, DIGITAL, OR HYBRID ELECTRONIC OR ELECTROMECHANICAL DEVICES THAT ARE INTERCONNECTED TO OPERATE AS A SYSTEM FREQUENTLY REFERRED TO AS AN INFORMATION TECHNOLOGY SYSTEM OR AUTOMATED DATA PROCESSING SYSTEM ALTHOUGH THE TERM SYSTEM IS NOT CONSIDERED DEFINITIVE INCLUDES AN ASSEMBLY OF DEVICES CONSISTING OF A CENTRAL PROCESSING UNIT AND THE NECESSARY INPUT/OUTPUT DEVICES, ACCESSORIAL DEVICES, ANALOG MEASUREMENT DEVICES, SOFTWARE AND/OR FIRMWARE REQUIRED TO PERFORM THE DESIRED OBJECTIVES ALSO INCLUDES SUB-ASSEMBLIES AND UNITS IN WHICH INPUT/OUTPUT DEVICES, CPUS, AND/OR ACCESSORIAL DEVICES OR COMPONENTS ARE INCORPORATED INTO A SINGLE ASSEMBLY OR UNIT A SPECIALLY DESIGNED DEVICE INCORPORATED INTO A SYSTEM SHALL NOT INFLUENCE THE CLASSIFICATION OF THE BASE ASSEMBLY OR UNIT | NA | 0 | 0 | 1862 |
| FALSE | TRUE | PRODUCT | 70 | INFORMATION TECHNOLOGY EQUIPMENT SOFTWARE,SUPPLIES& SUPPORT EQUIPMENT | 702 | 7020 | INFORMATION TECHNOLOGY CENTRAL PROCESSING UNIT , ANALOG | INFORMATION TECHNOLOGY CENTRAL PROCESSING UNIT , ANALOG | NA | 2015-10-01 | NA | INFORMATION TECHNOLOGY CENTRAL PROCESSING UNIT , ANALOG | NA | AN ANALOG IS A REPRESENTATION IN ONE FORM OF A PHYSICAL CONDITION EXISTING IN ANOTHER FORM THIS CLASS INCLUDES ONLY CPUS THAT ACCEPT AS INPUTS THE ELECTRICAL EQUIVALENT OF PHYSICAL CONDITIONS SUCH AS FLOW, TEMPERATURE, PRESSURE, ANGULAR POSITION OR VOLTAGE AND PERFORM COMPUTATIONS BY MANIPULATING THESE ELECTRICAL EQUIVALENTS TO PRODUCE RESULTS FOR FURTHER USE | NA | 0 | 0 | 1862 |
| FALSE | TRUE | PRODUCT | 70 | INFORMATION TECHNOLOGY EQUIPMENT SOFTWARE,SUPPLIES& SUPPORT EQUIPMENT | 702 | 7021 | INFORMATION TECHNOLOGY CENTRAL PROCESSING UNIT , DIGITAL | INFORMATION TECHNOLOGY CENTRAL PROCESSING UNIT , DIGITAL | NA | 2015-10-01 | NA | INFORMATION TECHNOLOGY CENTRAL PROCESSING UNIT , DIGITAL | NA | DIGITAL REFERS TO THE REPRESENTATION OF DISCRETE NUMBERS, SYMBOLS AND ALPHABETIC CHARACTERS BY A PREDETERMINED, CODED COMBINATION OF ELECTRICAL IMPULSES THIS CLASS INCLUDES ONLY CPUS THAT ACCEPT INFORMATION REPRESENTED BY DIGITAL IMPULSES SPECIFICALLY, A DEVICE CAPABLE OF PERFORMING SEQUENCES OF ARITHMETIC AND LOGIC OPERATIONS NOT ONLY ON DATA BUT ALSO ON THE PROGRAM WHICH IS CONTAINED IN ITS INTERNAL MEMORY WITHOUT INTERVENTION OF AN OPERATOR | NA | 0 | 0 | 1862 |
| FALSE | TRUE | PRODUCT | 70 | INFORMATION TECHNOLOGY EQUIPMENT SOFTWARE,SUPPLIES& SUPPORT EQUIPMENT | 702 | 7022 | INFORMATION TECHNOLOGY CENTRAL PROCESSING UNIT , HYBRID | INFORMATION TECHNOLOGY CENTRAL PROCESSING UNIT , HYBRID | NA | 2015-10-01 | NA | INFORMATION TECHNOLOGY CENTRAL PROCESSING UNIT , HYBRID | NA | HYBRID REFERS TO A COMBINATION OF ANALOG AND DIGITAL CAPABILITY AS DEFINED IN CLASSES 7020 AND 7021 WITH CONVERSION CAPABILITY REQUIRED FOR INTERCOMMUNICATION | NA | 0 | 0 | 1862 |
| FALSE | TRUE | PRODUCT | 70 | INFORMATION TECHNOLOGY EQUIPMENT SOFTWARE,SUPPLIES& SUPPORT EQUIPMENT | 702 | 7025 | INFORMATION TECHNOLOGY INPUT/OUTPUT AND STORAGE DEVICES | INFORMATION TECHNOLOGY INPUT/OUTPUT AND STORAGE DEVICES | NA | 2015-10-01 | NA | INFORMATION TECHNOLOGY INPUT/OUTPUT AND STORAGE DEVICES | NA | THIS CLASS INCLUDES DEVICES USED TO CONTROL TRANSFER INFORMATION TO AND FROM A COMPUTER THE INPUT DEVICE IS USED FOR TRANSFERRING DATA AND INSTRUCTIONS INTO A COMPUTER THE OUTPUT DEVICE IS USED TO TRANSFER RESULTS OF PROCESSING BY THE COMPUTER TO THE INFORMATION TECHNOLOGY OR ADP PERIPHERAL DEVICES INPUT/OUTPUT DEVICES COMBINE THE ABOVE FUNCTIONS IN THE SAME DEVICE THIS CLASS INCLUDES PRINTERS, DISPLAY UNITS, DISK DRIVE UNITS , TAPE DRIVE UNITS, TERMINALS, DATA ENTRY DEVICES AND TRANSFER UNITS ALSO INCLUDES OPTICAL COMPACT DISK DEVICES USED FOR THE STORAGE AND RETRIEVAL OF DATA AND FIRMWARE | NA | 0 | 0 | 1862 |
| FALSE | TRUE | SERVICE | M | OPERATION OF GOVT OWNED FACILITY | M2A | M2AA | HUSBANDING SERVICES, COMMUNICATIONS SERVICES | HUSBANDING SERVICES, COMMUNICATIONS SERVICES | NA | 2020-03-17 | NA | HUSBANDING SERVICES, COMMUNICATIONS SERVICES | NA | NA | INCLUDES LANDLINES, MOBILE PHONES, AND SIM CARDS | 0 | 0 | 233 |
| FALSE | TRUE | SERVICE | M | OPERATION OF GOVT OWNED FACILITY | M2A | M2AB | HUSBANDING SERVICES, FORCE PROTECTION | HUSBANDING SERVICES, FORCE PROTECTION | NA | 2020-03-17 | NA | HUSBANDING SERVICES, FORCE PROTECTION | NA | NA | INCLUDES SECURITY GUARDS, DEMARCATION FLOATING PERIMETERS, PICKET BOATS, METAL DETECTORS, X-RAY MACHINE, EOD DIVERS, AND BARRIERS | 0 | 0 | 233 |
| FALSE | TRUE | SERVICE | M | OPERATION OF GOVT OWNED FACILITY | M2A | M2AC | HUSBANDING SERVICES, REMOVAL SERVICES | HUSBANDING SERVICES, REMOVAL SERVICES | NA | 2020-03-17 | NA | HUSBANDING SERVICES, REMOVAL SERVICES | NA | NA | INCLUDES SEWAGE, OILY WASTE AND TRASH | 0 | 0 | 233 |
| FALSE | TRUE | SERVICE | M | OPERATION OF GOVT OWNED FACILITY | M2A | M2AD | HUSBANDING SERVICES, MATERIAL HANDLING | HUSBANDING SERVICES, MATERIAL HANDLING | NA | 2020-03-17 | NA | HUSBANDING SERVICES, MATERIAL HANDLING | NA | NA | INCLUDES CUSTOMS CLEARING, RECORDING, EXPORTING, TRANSPORTATION, AND MAIL DELIVERY | 0 | 0 | 233 |
| FALSE | TRUE | SERVICE | M | OPERATION OF GOVT OWNED FACILITY | M2A | M2AE | HUSBANDING SERVICES, PURCHASING SERVICES | HUSBANDING SERVICES, PURCHASING SERVICES | NA | 2020-03-17 | NA | HUSBANDING SERVICES, PURCHASING SERVICES | NA | NA | INCLUDES INCIDENTALS, LOCAL PROCUREMENT, AND PROVISIONS | 0 | 0 | 233 |
| FALSE | TRUE | SERVICE | M | OPERATION OF GOVT OWNED FACILITY | M2A | M2AF | HUSBANDING SERVICES, INCIDENTAL SERVICES | HUSBANDING SERVICES, INCIDENTAL SERVICES | NA | 2020-03-17 | NA | HUSBANDING SERVICES, INCIDENTAL SERVICES | NA | NA | INCLUDES MONEY EXCHANGE, LAUNDRY, INTERPRETER SERVICES, PAINT LIGHTER, TENTS | 0 | 0 | 233 |
| FALSE | TRUE | SERVICE | M | OPERATION OF GOVT OWNED FACILITY | M2B | M2BA | HUSBANDING SERVICES, TRANSPORTATION SERVICES | HUSBANDING SERVICES, TRANSPORTATION SERVICES | NA | 2020-03-17 | NA | HUSBANDING SERVICES, TRANSPORTATION SERVICES | NA | NA | INCLUDES CARS, MINI VANS/BUSES, WATER TAXIS & BOATS, CARGO VANS & TRUCKS, NAVIGATION | 0 | 0 | 233 |
| FALSE | TRUE | SERVICE | M | OPERATION OF GOVT OWNED FACILITY | M2B | M2BB | HUSBANDING SERVICES, FUEL SERVICES | HUSBANDING SERVICES, FUEL SERVICES | NA | 2020-03-17 | NA | HUSBANDING SERVICES, FUEL SERVICES | NA | NA | INCLUDES FUEL HANDLING, DELIVERY, AND LUBRICANTS | 0 | 0 | 233 |
| FALSE | TRUE | SERVICE | M | OPERATION OF GOVT OWNED FACILITY | M2B | M2BZ | HUSBANDING SERVICES, OTHER PORT SERVICES | HUSBANDING SERVICES, OTHER PORT SERVICES | NA | 2020-03-17 | NA | HUSBANDING SERVICES, OTHER PORT SERVICES | NA | NA | INCLUDES PILOTS, TUGS, BROWS/FENDERS, BARGES, BERTHING, LINE HANDLERS, PORTABLE WATER, CRANES, FORKLIFTS, CUSTOMS AND IMMIGRATION, QUARANTINE, PORT ENTRY, CLEARANCE, AND FLEET LANDING | 0 | 0 | 233 |
| FALSE | TRUE | SERVICE | M | OPERATION OF GOVT OWNED FACILITY | M2C | M2CA | SHIP HUSBANDING SERVICES, MANAGEMENT/INTEGRATION SERVICE | SHIP HUSBANDING SERVICES, MANAGEMENT/INTEGRATION SERVICE | NA | 2020-03-17 | NA | SHIP HUSBANDING SERVICES, MANAGEMENT/INTEGRATION SERVICE | NA | NA | INCLUDES MANAGEMENT BY A HUSBANDING SERVICES PROVIDER OR HUSBANDING SERVICE AGENT TO PROVIDE AN INTEGRATED MULTI-SERVICES SOLUTION FOR PORT SERVICES | 0 | 0 | 233 |
Lets explore the connections of these 20 newest PSCs
hc_new_psc_node_graph <-
tbl_10_new_psc %>%
select(name_solicitation_group, name_product_service) %>%
as_tbl_graph() %>%
hchart() %>%
hc_title(text = "New Product Service Codes Node Graph") %>%
hc_add_theme(hc_theme_darkunica()) %>%
hc_xAxis(visible = FALSE) %>%
hc_yAxis(visible = FALSE)
hc_new_psc_node_graph
This section explores fpds_csv which provides real time access into the FPDS csv interface.
args(fpds_csv)
tbl_pyro <-
fpds_csv(product_or_service_code = "1370", snake_names = T)
This is a great practice for exploring data
glimpse(tbl_pyro)
Rows: 6,118 Columns: 43 $ id_contract
skim(tbl_pyro)
| Name | tbl_pyro |
| Number of rows | 6118 |
| Number of columns | 43 |
| _______________________ | |
| Column type frequency: | |
| character | 29 |
| Date | 2 |
| logical | 4 |
| numeric | 8 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| id_contract | 0 | 1.00 | 9 | 34 | 0 | 2477 | 0 |
| id_contract_analysis | 0 | 1.00 | 9 | 34 | 0 | 2475 | 0 |
| type_procurement | 0 | 1.00 | 5 | 5 | 0 | 1 | 0 |
| id_contract_idv | 3320 | 0.46 | 9 | 15 | 0 | 510 | 0 |
| code_modification | 0 | 1.00 | 1 | 21 | 0 | 222 | 0 |
| type_award | 0 | 1.00 | 17 | 51 | 0 | 10 | 0 |
| id_agency_award | 0 | 1.00 | 4 | 4 | 0 | 30 | 0 |
| name_agency_award | 112 | 0.98 | 14 | 45 | 0 | 28 | 0 |
| name_office_award | 23 | 1.00 | 3 | 53 | 0 | 254 | 0 |
| type_product_or_service | 0 | 1.00 | 1 | 1 | 0 | 1 | 0 |
| code_product_service | 0 | 1.00 | 4 | 4 | 0 | 1 | 0 |
| name_naics | 1548 | 0.75 | 7 | 118 | 0 | 103 | 0 |
| name_vendor | 0 | 1.00 | 4 | 63 | 0 | 623 | 0 |
| city_vendor | 190 | 0.97 | 4 | 18 | 0 | 396 | 0 |
| code_state_vendor | 389 | 0.94 | 1 | 9 | 0 | 57 | 0 |
| zipcode_vendor | 225 | 0.96 | 4 | 5 | 0 | 473 | 0 |
| name_vendor_parent | 224 | 0.96 | 4 | 52 | 0 | 432 | 0 |
| code_additional_reporting | 5817 | 0.05 | 1 | 4 | 0 | 2 | 0 |
| description_additonal_reporting | 5817 | 0.05 | 17 | 45 | 0 | 2 | 0 |
| url_csv | 0 | 1.00 | 169 | 169 | 0 | 1 | 0 |
| id_department_award | 343 | 0.94 | 4 | 4 | 0 | 10 | 0 |
| name_department_award | 0 | 1.00 | 19 | 45 | 0 | 11 | 0 |
| type_contract_id_analysis | 0 | 1.00 | 3 | 8 | 0 | 2 | 0 |
| name_agency_cgac_award | 0 | 1.00 | 19 | 45 | 0 | 14 | 0 |
| slug_cgac_award | 0 | 1.00 | 3 | 3 | 0 | 14 | 0 |
| type_psc | 0 | 1.00 | 7 | 7 | 0 | 1 | 0 |
| name_product_service | 0 | 1.00 | 12 | 12 | 0 | 1 | 0 |
| id_solicitation_group | 0 | 1.00 | 2 | 2 | 0 | 1 | 0 |
| name_solicitation_group | 0 | 1.00 | 25 | 25 | 0 | 1 | 0 |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| date_obligation | 0 | 1.00 | 1978-10-15 | 2020-09-23 | 2008-11-25 | 2942 |
| date_solicitation | 5934 | 0.03 | 2013-04-11 | 2020-08-23 | 2018-08-24 | 54 |
Variable type: logical
| skim_variable | n_missing | complete_rate | mean | count |
|---|---|---|---|---|
| has_duns_parent | 0 | 1.00 | 0.97 | TRU: 5938, FAL: 180 |
| is_missing_duns | 0 | 1.00 | 0.03 | FAL: 5938, TRU: 180 |
| has_parent | 180 | 0.97 | 0.47 | FAL: 3143, TRU: 2795 |
| is_idv | 0 | 1.00 | 0.46 | FAL: 3320, TRU: 2798 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| number_transaction | 0 | 1.00 | 0.44 | 2.01 | 0 | 0 | 0.00 | 0 | 17 | ▇▁▁▁▁ |
| amount_obligation | 0 | 1.00 | 841978.71 | 3477601.15 | -37099346 | 0 | 6699.57 | 145000 | 77351610 | ▁▇▁▁▁ |
| id_naics | 1543 | 0.75 | 340155.69 | 69999.03 | 114210 | 325998 | 325998.00 | 332993 | 926140 | ▁▇▁▁▁ |
| id_duns | 180 | 0.97 | 186291076.98 | 278747509.25 | 1007595 | 7031479 | 52814121.00 | 151954310 | 999195803 | ▇▁▁▁▁ |
| id_duns_parent | 180 | 0.97 | 248556355.70 | 275090104.48 | 1007595 | 43190826 | 149528882.00 | 217304393 | 999195803 | ▇▃▁▂▁ |
| year_fiscal_obligation | 0 | 1.00 | 2005.76 | 10.44 | 1978 | 2001 | 2008.00 | 2013 | 2020 | ▂▂▂▇▇ |
| code_department_award | 343 | 0.94 | 90.59 | 19.99 | 12 | 97 | 97.00 | 97 | 97 | ▁▁▁▁▇ |
| id_cgac_award | 0 | 1.00 | 28.27 | 19.99 | 12 | 17 | 21.00 | 21 | 97 | ▇▁▁▁▁ |
First thing to look at is how much we have spent on these products.
tbl_by_day <-
tbl_pyro %>%
group_by(date_obligation) %>%
summarise(amount = sum(amount_obligation), .groups = "drop")
Lets take a look at the data
tbl_by_day %>% sample_n(3) %>% munge_data() %>% gt()
| date_obligation | amount |
|---|---|
| 2011-02-24 | $3,915,256 |
| 2011-03-29 | $0 |
| 2011-11-23 | $0 |
tbl_by_day %>% filter(amount == 0) %>% nrow()
[1] 879
Lets filter them out
tbl_by_day <- tbl_by_day %>% filter(amount != 0)
hc_by_day <-
tbl_by_day %>%
hchart("spline",
hcaes(x = date_obligation, y = amount)) %>%
hc_title(text = "Pyrotechnic Spend By Day Spend by Date") %>%
hc_yAxis(title = list(text = "Amount Obligated")) %>%
hc_xAxis(title = list(text = "Date Added")) %>%
hc_add_theme(hc_theme_elementary())
Lets add a cumulative total
tbl_by_day <-
tbl_by_day %>%
mutate(amount_cumulative = cumsum(amount))
Now lets explore it statically
gg_area_pyro <-
tbl_by_day %>%
ggplot(aes(date_obligation, amount_cumulative)) +
geom_area(fill = "#22908C", alpha = .5) +
scale_fill_viridis(discrete = TRUE) +
scale_y_continuous(labels = scales::dollar) +
scale_x_date() +
theme(legend.position = "none") +
theme_ipsum() +
labs(title = "Cumulative Federal Procurement on Pyrotechnics", x = "Date", y = "Cumulative Procurement Spend")
gg_area_pyro
Lets add some model fits
gg_area_pyro <-
gg_area_pyro +
geom_smooth(method = "lm") +
geom_smooth(method = "loess", color = "black")
gg_area_pyro
Often products and services tend to be procured in September at the end of the budget year, does this product exhibit that trend?
Lets build a table that gives us the inputs
govt_months <-
c(
"Oct",
"Nov",
"Dec",
"Jan",
"Feb",
"Mar",
"Apr",
"May",
"Jun",
"Jul",
"Aug",
"Sep"
)
tbl_monthly_pyro <-
tbl_pyro %>%
mutate(month_obligation = lubridate::month(date_obligation, label = T)) %>%
count(year_fiscal_obligation,
month_obligation,
wt = amount_obligation,
name = "amount")
## Set the factor levels to budget months
tbl_monthly_pyro <-
tbl_monthly_pyro %>%
mutate(month_obligation = factor(
month_obligation,
levels = govt_months,
ordered = T
))
Lets turn this into an interactive heatmap.
fntltp <- JS("function(){
return this.point.x + ' ' + this.series.yAxis.categories[this.point.y] + ': ' +
Highcharts.numberFormat(this.point.value, 2);
}")
hc_pyro_hm <-
hchart(
tbl_monthly_pyro,
"heatmap",
hcaes(x = year_fiscal_obligation,
y = month_obligation,
value = amount)
) %>%
hc_colorAxis(
stops = color_stops(20, colors = scales::viridis_pal(option = "B")(20)),
# fuerza a utilzar mejor el espectro de colores para que HJS no amplie el
# eje para tener numero "redondos
startOnTick = FALSE,
endOnTick = FALSE,
reversed = T
) %>%
hc_yAxis(
title = list(text = ""),
reversed = TRUE,
offset = -20,
tickLength = 0,
gridLineWidth = 0,
minorGridLineWidth = 0,
labels = list(style = list(fontSize = "9px"))
) %>%
hc_tooltip(formatter = fntltp) %>%
hc_title(text = "Spend by Month and Fiscal Year") %>%
hc_legend(
layout = "horizontal",
verticalAlign = "top",
align = "left",
valueDecimals = 0
) %>%
hc_add_theme(hc_theme_darkunica())
hc_pyro_hm
Looks like it fits the usual trend but can we actually quantify it?
Lets do a basic liner model exploring this.
mod <-
rtemis::s.LM(x = tbl_monthly_pyro$month_obligation, y = tbl_monthly_pyro$amount, intercept = F)
[2020-11-05 16:14:51 rtemis::s.LM] Hello, alexbresler
[[ Regression Input Summary ]] Training features: 497 x 1 Training outcome: 497 x 1 Testing features: Not available Testing outcome: Not available
[2020-11-05 16:14:51 rtemis::s.LM] Training linear model…
[[ LM Regression Training Summary ]] MSE = 2.7e+14 (8.55%) RMSE = 1.6e+07 (4.37%) MAE = 1e+07 (4.84%) r = 0.29 (p = 3e-11) rho = 0.33 (p = 4.1e-14) R sq = 0.09 [2020-11-05 16:14:51 rtemis::s.LM] Run completed in 0.01 minutes (Real: 0.61; User: 0.38; System: 0.08)
Now lets explore the variable importance!
var_imp <- mod$varimp
tbl_coef <-
tibble(month = names(var_imp), amount_coef = var_imp) %>%
arrange(desc(amount_coef)) %>%
munge_data()
Let’s see
gt(tbl_coef)
| month | amount_coef |
|---|---|
| XSEP | $21,731,668 |
| XMAR | $18,487,441 |
| XAPR | $12,047,660 |
| XJUN | $11,315,741 |
| XAUG | $10,117,001 |
| XMAY | $9,738,273 |
| XFEB | $9,336,068 |
| XJUL | $8,661,087 |
| XJAN | $7,656,582 |
| XDEC | $6,705,743 |
| XNOV | $4,033,896 |
Lets take a look at spend by department.
tbl_depts <-
tbl_pyro %>%
count(
name_department_award,
wt = amount_obligation,
name = "amount",
sort = T
) %>%
mutate(name_department_award = fct_reorder(name_department_award, amount))
tbl_depts %>% munge_data() %>% gt()
| name_department_award | amount |
|---|---|
| DEPARTMENT OF DEFENSE | $5,112,237,946 |
| NATIONAL AERONAUTICS AND SPACE ADMINISTRATION | $23,919,053 |
| GENERAL SERVICES ADMINISTRATION | $7,093,773 |
| DEPARTMENT OF THE INTERIOR | $3,036,645 |
| DEPARTMENT OF AGRICULTURE | $2,489,224 |
| DEPARTMENT OF TRANSPORTATION | $995,000 |
| DEPARTMENT OF STATE | $662,286 |
| DEPARTMENT OF ENERGY | $293,108 |
| DEPARTMENT OF HOMELAND SECURITY | $241,504 |
| DEPARTMENT OF JUSTICE | $213,213 |
| DEPARTMENT OF VETERANS AFFAIRS | $44,000 |
Now we can visualize it.
gg_spend <-
tbl_depts %>%
mutate(amount_millions = amount /1000000) %>%
ggplot(aes(x = name_department_award, y = amount_millions)) +
geom_segment(
aes(
x = name_department_award ,
xend = name_department_award,
y = 0,
yend = amount_millions
),
color = "grey"
) +
geom_point(size = 3, color = "#69b3a2") +
coord_flip() +
theme_ipsum() +
theme(
legend.position = "none",
panel.grid.minor.y = element_blank(),
panel.grid.major.y = element_blank(),
text = element_text(size = 7)
) +
xlab("") +
scale_y_log10(labels = scales::dollar, n.breaks = 10) +
labs(title = "Which Departments Purchased the Explosives?",
x = "",
y = "Procurement $ in milions (log10 transformed)")
gg_spend
Now lets look at contract size against agency group.
tbl_pyro %>%
filter(amount_obligation > 0) %>% count(name_agency_cgac_award,
wt = amount_obligation,
sort = T,
name = "amount") %>%
munge_data() %>%
gt()
| name_agency_cgac_award | amount |
|---|---|
| DEPARTMENT OF THE ARMY | $3,157,566,337 |
| DEPARTMENT OF THE NAVY | $1,288,027,486 |
| DEPARTMENT OF THE AIR FORCE | $730,536,689 |
| DEPARTMENT OF DEFENSE | $89,065,356 |
| NATIONAL AERONAUTICS AND SPACE ADMINISTRATION | $25,393,388 |
| GENERAL SERVICES ADMINISTRATION | $7,093,773 |
| DEPARTMENT OF THE INTERIOR | $3,037,345 |
| DEPARTMENT OF AGRICULTURE | $2,514,745 |
| DEPARTMENT OF TRANSPORTATION | $995,000 |
| DEPARTMENT OF STATE | $662,738 |
| DEPARTMENT OF ENERGY | $293,108 |
| DEPARTMENT OF HOMELAND SECURITY | $242,212 |
| DEPARTMENT OF JUSTICE | $219,532 |
| DEPARTMENT OF VETERANS AFFAIRS | $44,000 |
Looks like it is really skewed to a few agencies. Perfect situation to lump some of the groups
tbl_agency_sum <-
tbl_pyro %>%
filter(amount_obligation > 0) %>%
mutate(
name_agency_cgac_award = name_agency_cgac_award %>% fct_lump(8, w = amount_obligation, other_level = "OTHER 6 AGENCIES")
) %>%
group_by(name_agency_cgac_award, id_contract_analysis) %>%
summarise(amount = sum(amount_obligation, na.rm = T)) %>%
ungroup() %>%
filter(amount > 1000) %>%
mutate(name_agency_cgac_award = fct_reorder(name_agency_cgac_award, amount))
Now we can explore the range using a boxplot and even layer in the volume with a scatter plot!
gg_contract <-
tbl_agency_sum %>%
ggplot(aes(
x = factor(name_agency_cgac_award),
y = amount,
fill = name_agency_cgac_award
)) +
geom_boxplot() +
geom_jitter(color = "black",
size = 0.3,
alpha = .5) +
scale_fill_viridis(discrete = TRUE, alpha = .5) +
theme_ipsum() +
scale_y_log10(labels = scales::dollar) +
theme(legend.position = "none",
plot.title = element_text(size = 11)) +
labs(
title = "Distrubition of Award Size by Agency Group",
x = "",
y = "",
credits = "Awards over $1000"
) +
coord_flip()
gg_contract
Next lets see how the government procures this product and visualize it in a treemap.
Lets set the treemap parameters
lvl_opts <- list(
list(
level = 1,
borderWidth = 0,
borderColor = "transparent",
dataLabels = list(
enabled = TRUE,
align = "left",
verticalAlign = "top",
style = list(
fontSize = "12px",
textOutline = FALSE,
color = "white"
)
)
),
list(
level = 2,
borderWidth = 0,
borderColor = "transparent",
colorVariation = list(key = "brightness", to = 0.250),
dataLabels = list(enabled = T),
style = list(
fontSize = "8px",
textOutline = FALSE,
color = "white"
)
)
)
Now we can build the treemap
hc_treemap_spend <-
tbl_pyro %>%
count(name_department_award, type_award, sort = T) %>%
highcharter::data_to_hierarchical(
group_vars = c("name_department_award", "type_award"),
size_var = "n"
) %>%
hchart(
type = "treemap",
levels = lvl_opts,
tooltip = list(valueDecimals = FALSE)
) %>%
hc_add_theme(hc_theme_superheroes())
hc_treemap_spend
Next lets take a look at the companies that supply this product.
First thing we want to do is build a summary table that lets us explore the data.
tbl_vendors <-
tbl_pyro %>%
group_by(id_duns) %>%
summarise(
name_vendor = name_vendor[which.max(amount_obligation)],
date_first_award = min(date_obligation, na.rm = T),
date_recent_award = max(date_obligation, na.rm = T),
count_actions = n(),
count_contracts = n_distinct(id_contract_analysis, na.rm = T),
amount_contracts = sum(amount_obligation),
count_departments = n_distinct(name_department_award, na.rm = T),
count_agencies = n_distinct(name_agency_cgac_award, na.rm = T)
) %>%
arrange(desc(amount_contracts))
Who are the top 10 vendors?
tbl_vendors %>%
slice(1:10) %>%
munge_data() %>%
gt()
| id_duns | name_vendor | date_first_award | date_recent_award | count_actions | count_contracts | amount_contracts | count_departments | count_agencies |
|---|---|---|---|---|---|---|---|---|
| 2341824 | ALLOY SURFACES COMPANY, INC. | 1990-09-15 | 2020-06-17 | 543 | 76 | $1,136,321,359 | 1 | 4 |
| 128342156 | ARMTEC COUNTERMEASURES COMPANY | 2003-09-18 | 2020-07-23 | 499 | 44 | $691,846,016 | 1 | 4 |
| 18243985 | KILGORE FLARES COMPANY LLC | 2007-03-30 | 2020-07-14 | 419 | 26 | $621,649,431 | 1 | 4 |
| 824862254 | KILGORE FLARES COMPANY LLC | 1995-06-15 | 2009-03-23 | 160 | 35 | $414,372,086 | 1 | 3 |
| 2241164 | ATK THIOKOL INC | 1997-06-02 | 2018-09-27 | 186 | 26 | $317,356,699 | 1 | 4 |
| 7031479 | SECURITY SIGNALS, INC | 1981-12-15 | 2020-09-23 | 329 | 55 | $264,515,231 | 1 | 2 |
| 611068453 | ARMTEC DEFENSE PRODUCTS CO INC | 2002-10-04 | 2020-07-30 | 112 | 9 | $221,700,572 | 1 | 3 |
| 52814121 | PYROTECHNIQUE BY GRUCCI, INC | 2002-08-08 | 2020-05-20 | 246 | 9 | $173,039,332 | 1 | 1 |
| 47966593 | MARTIN ELECTRONICS INC | 1982-07-15 | 2020-06-30 | 237 | 59 | $136,776,436 | 1 | 4 |
| 7020159 | KILGORE CORPORATION | 1980-11-15 | 1995-03-15 | 75 | 74 | $125,514,000 | 2 | 4 |
Lets skim the data
skim(tbl_vendors)
| Name | tbl_vendors |
| Number of rows | 499 |
| Number of columns | 9 |
| _______________________ | |
| Column type frequency: | |
| character | 1 |
| Date | 2 |
| numeric | 6 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| name_vendor | 0 | 1 | 4 | 63 | 0 | 463 | 0 |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| date_first_award | 0 | 1 | 1978-10-15 | 2020-08-12 | 2002-04-17 | 400 |
| date_recent_award | 0 | 1 | 1979-12-15 | 2020-09-23 | 2005-08-30 | 429 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| id_duns | 1 | 1 | 218034258.18 | 292070392.45 | 1007595 | 27759418 | 86011125.5 | 193993510.8 | 999195803 | ▇▁▁▁▁ |
| count_actions | 0 | 1 | 12.26 | 47.50 | 1 | 1 | 2.0 | 5.5 | 543 | ▇▁▁▁▁ |
| count_contracts | 0 | 1 | 5.00 | 15.40 | 1 | 1 | 1.0 | 3.0 | 249 | ▇▁▁▁▁ |
| amount_contracts | 0 | 1 | 10323097.70 | 72319422.98 | -444000 | 20794 | 82407.2 | 455000.0 | 1136321359 | ▇▁▁▁▁ |
| count_departments | 0 | 1 | 1.08 | 0.32 | 1 | 1 | 1.0 | 1.0 | 4 | ▇▁▁▁▁ |
| count_agencies | 0 | 1 | 1.25 | 0.60 | 1 | 1 | 1.0 | 1.0 | 6 | ▇▁▁▁▁ |
Lets filter our data to include only vendors with over $500,000 in obligations.
tbl_vendors <- tbl_vendors %>% filter(amount_contracts > 500000)
Now lets take a look at the top vendors visually.
gg_top_vendors_pyro <-
tbl_vendors %>%
filter(amount_contracts > 0) %>%
mutate(
name_vendor_lumped = name_vendor %>% fct_lump(10, w = amount_contracts, other_level = "ALL OTHER VENDORS")
) %>%
count(name_vendor_lumped, wt = amount_contracts, name = "amount") %>%
mutate(name_vendor_lumped = fct_reorder(name_vendor_lumped, amount)) %>%
ggplot(aes(x = name_vendor_lumped, y = amount)) +
geom_bar(stat = "identity", fill = "#B71212") +
coord_flip() +
theme_ipsum() +
theme(
panel.grid.minor.y = element_blank(),
panel.grid.major.y = element_blank(),
legend.position = "none"
) +
xlab("") +
labs(title = "Top Explosive Vendors") +
scale_y_continuous(labels = scales::dollar, n.breaks = 10)
gg_top_vendors_pyro
Lets try to figure out how many clusters exist. We should transform the x and y axis to make it easier to process visually.
Lets try 4 clusters
mplot3.xy(
log(tbl_vendors$count_actions),
log(tbl_vendors$amount_contracts / 1000000),
cluster = "PAM",
cluster.params = list(k = 4),
main = "Contracts Actions vs Contract $ by PAM Cluster Group",
fit = "lm",
xlab = "Contract Actions",
ylab = "Contract $ (millions)",
theme = "white"
)
Looks Good!
Now lets take a look at a normalzied view of all of the key vendor features.
tbl_vendors %>%
select(-c(id_duns)) %>%
select_if(is.numeric) %>%
preprocess(scale = T) %>%
mplot3.x(group.title = "Normalized Numeric Feature Distributions [Explosive Vendors - Over $500k in Obligations]",
theme = 'white',
density.line = T)
[2020-11-05 16:15:03 preprocess] Scaling 5 numeric features… [2020-11-05 16:15:03 preprocess] Done
First we need to build an input table with the data on the dimensions to be reduced
tbl_pyro_office <-
tbl_pyro %>%
group_by(name_office_award) %>%
summarise(
department = name_department_award[which.max(amount_obligation)],
agency = name_agency_cgac_award[which.max(amount_obligation)],
year_first = year(date_obligation) %>% min(),
date_recent_award = year(date_obligation) %>% max(),
count_actions = n(),
count_contracts = n_distinct(id_contract_analysis, na.rm = T),
amount_contracts = sum(amount_obligation),
count_distinct_vendors = n_distinct(id_duns, na.rm = T),
count_distinct_parents = n_distinct(id_duns_parent, na.rm = T),
.groups = "drop"
) %>%
arrange(desc(amount_contracts))
tbl_pyro_office <-
tbl_pyro_office %>%
mutate(agency_lumped = fct_lump(agency, n = 6, other_level = "ALL OTHER AGENCIES"))
reactable(tbl_pyro_office, filterable = F,
resizable = T,
searchable = T,
showPageSizeOptions = T,
pageSizeOptions = c(5, 10, 20),
sortable = T,
)
tbl_umap <-
tbl_pyro_office %>%
select_if(is.numeric) %>%
uwot::umap(n_neighbors = 14, metric = "manhattan") %>%
as_tibble() %>%
setNames(c("umap_001", "umap_002"))
tbl_pyro_office <-
tbl_pyro_office %>%
bind_cols(
tbl_umap
)
Finally we can visualize the output.
x <-
c(
"Department",
"Agency",
"Office",
"Contracts",
"Vendors",
"Amount $",
"Actions",
"umap 1",
"umap 2"
)
y <-
sprintf(
"{point.%s:.2f}",
c(
"department",
"agency",
"name_office_award",
"count_actions",
"count_distinct_vendors",
"amount_contracts",
"count_actions",
"umap_001",
"umap_002"
)
)
tltip <- tooltip_table(x, y)
hc_umap <-
tbl_pyro_office %>%
hchart(
"scatter",
hcaes(
x = umap_001,
y = umap_002,
group = agency_lumped,
name = name_office_award
),
marker = list(radius = 1, symbol = 'circle')
) %>%
hc_add_theme(hc_theme_darkunica()) %>%
hc_xAxis(visible = F) %>%
hc_yAxis(visible = F) %>%
hc_title(text = "Pyrotechnic Office Dimension Reduction") %>%
hc_tooltip(
useHTML = TRUE,
headerFormat = "{point.name}",
pointFormat = tltip,
table = T
)
hc_umap
Next we will explore FPDS atom, this provides the full interface into all the data contained in FPDS going back to 1978.
This provides access to significantly more data but is extremely compute heavy and exponentially slower than fpds_csv
args(fpds_atom)
function (global_vendor_name = NA, vendor_name = NA, parent_vendor_name = NA, department_name = NA, award_type = NA, research = NA, vendor_duns_number = NA, parent_duns_number = NA, vendor_doing_business_as_name = NA, agency_name = NA, contracting_office_name = NA, contracting_agency_name = NA, principal_naics_code = NA, award_status = NA, subcontract_plan = NA, solicitation_procedure = NA, contract_type = NA, contract_type_description = NA, type_of_contract_pricing = NA, contract_id = NA, ref_idv_contract_id = NA, ref_idv_agency_id = NA, contracting_agency_id = NA, contracting_office_id = NA, funding_agency_id = NA, funding_office_id = NA, funding_office_name = NA, agency_code = NA, department_id = NA, last_mod_date = NA, last_modified_by = NA, award_completion_date = NA, created_date = NA, signed_date = NA, effective_date = NA, estimated_completion_date = NA, cancellation_date = NA, destroy_date = NA, final_invoice_paid_date = NA, funded_through_date = NA, last_modified_date = NA, physical_completion_date = NA, reveal_date = NA, solicitation_issue_date = NA, sys_last_modified_date = NA, vendor_registration_date = NA, vendor_renewal_date = NA, base_exercised_options_value = NA, current_contract_value = NA, dollars_obligated = NA, contract_value = NA, fee_range_lower_value = NA, fee_range_upper_value = NA, fixed_fee_value = NA, obligated_amount = NA, total_current_contract_value = NA, total_dollars_obligated = NA, total_non_government_value = NA, total_ultimate_contract_value = NA, ultimate_contract_value = NA, contract_fiscal_year = NA, created_by = NA, description_of_requirement = NA, reason_for_modification = NA, legislative_mandates = NA, local_area_set_aside = NA, socio_economic_indicators = NA, multiyear_contract = NA, national_interest_code = NA, product_or_service_code = NA, performance_district_code = NA, performance_country = NA, performance_state_name = NA, vendor_address_city = NA, vendor_congress_district_code = NA, vendor_address_country_code = NA, vendor_address_country_name = NA, vendor_address_state_code = NA, vendor_address_state_name = NA, vendor_address_zip_code = NA, extent_competed = NA, number_of_offers_received = NA, sort_item = “Signed Date”, use_future = F, show_progress = T, clean_address = T, clean_entity_column = T, sort_descending = T, parse_contracts = F, snake_names = F, keep_key_columns = F, exclude_bloat = F, unformat = T, return_message = T, …) NULL
tbl_anduril <-
fpds_atom(
vendor_name = "ANDURIL",
parse_contracts = T,
snake_names = T
)
Lets explore the data
glimpse(tbl_anduril)
Rows: 43 Columns: 334 $ id_contract_analysis